suppressPackageStartupMessages({
library(data.table)
library(DESeq2)
library(gplots)
library(here)
library(hyperSpec)
library(parallel)
library(pander)
library(plotly)
library(tidyverse)
library(tximport)
library(vsn)
})
source(here("UPSCb-common/src/R/featureSelection.R"))
hpal <- colorRampPalette(c("blue","white","red"))(100)
samples <- read_csv(here("doc/samples_final.csv"))
## Parsed with column specification:
## cols(
## ScilifeID = col_character(),
## SubmittedID = col_character(),
## Stages = col_character(),
## Description = col_character(),
## ID = col_character()
## )
filelist <- list.files(here("data/salmon"),
recursive = TRUE,
pattern = "quant.sf",
full.names = TRUE)
Sanity check to ensure that the data is sorted according to the sample info
filelist <- filelist[match(samples$ScilifeID,sub("_sortmerna.*","",basename(dirname(filelist))))]
stopifnot(all(match(sub("_sortmerna.*","",basename(dirname(filelist))),
samples$ScilifeID) == 1:nrow(samples)))
name the file list vector
names(filelist) <- samples$ID
Read the expression at the gene level
counts <- suppressMessages(round(tximport(files = filelist,
type = "salmon",
txOut=TRUE)$counts))
combine technical replicates
samples$ID <- sub("_L00[1,2]", "",
samples$ScilifeID)
counts <- do.call(
cbind,
lapply(split.data.frame(t(counts),
samples$ID),
colSums))
csamples <- samples[,-1]
csamples <- csamples[match(colnames(counts),csamples$ID),]
sel <- rowSums(counts) == 0
sprintf("%s%% percent (%s) of %s genes are not expressed",
round(sum(sel) * 100/ nrow(counts),digits=1),
sum(sel),
nrow(counts))
## [1] "19.8% percent (13135) of 66360 genes are not expressed"
dat <- tibble(x=colnames(counts),y=colSums(counts)) %>%
bind_cols(csamples)
ggplot(dat,aes(x,y,fill=csamples$Stages)) + geom_col() +
scale_y_continuous(name="reads") +
theme(axis.text.x=element_text(angle=90,size=4),axis.title.x=element_blank())
i.e. the mean raw count of every gene across samples is calculated and displayed on a log10 scale.
The cumulative gene coverage is as expected
ggplot(data.frame(value=log10(rowMeans(counts))),aes(x=value)) +
geom_density() + ggtitle("gene mean raw counts distribution") +
scale_x_continuous(name="mean raw counts (log10)")
## Warning: Removed 13135 rows containing non-finite values (stat_density).
The same is done for the individual samples colored by Stages.
dat <- as.data.frame(log10(counts)) %>% utils::stack() %>%
mutate(Stages=csamples$Stages[match(ind,csamples$ID)])
ggplot(dat,aes(x=values,group=ind,col=Stages)) +
geom_density() + ggtitle("sample raw counts distribution") +
scale_x_continuous(name="per gene raw counts (log10)")
## Warning: Removed 622137 rows containing non-finite values (stat_density).
dir.create(here("data/analysis/salmon"),showWarnings=FALSE,recursive=TRUE)
write.csv(counts,file=here("data/analysis/salmon/raw-unormalised-gene-expression_data_genes+TEs.csv"))
For visualization, the data is submitted to a variance stabilization transformation using DESeq2. The dispersion is estimated independently of the sample tissue and replicate.
csamples$Stages <- factor(csamples$Stages)
dds <- DESeqDataSetFromMatrix(
countData = counts,
colData = csamples,
design = ~Stages)
## converting counts to integer mode
save(dds,file=here("data/analysis/salmon/dds_genes+TEs.rda"))
Check the size factors (i.e. the sequencing library size effect)
dds <- estimateSizeFactors(dds)
sizes <- sizeFactors(dds)
pander(sizes)
| P7614_301_S1 | P7614_302_S2 | P7614_303_S3 | P7614_304_S4 | P7614_305_S5 |
|---|---|---|---|---|
| 0.997 | 0.9789 | 1.096 | 1.053 | 1.102 |
| P7614_306_S6 | P7614_307_S7 | P7614_308_S8 | P7614_309_S9 | P7614_310_S10 |
|---|---|---|---|---|
| 1.311 | 1.091 | 1.167 | 0.9277 | 0.8292 |
| P7614_311_S11 | P7614_312_S12 | P7614_313_S13 | P7614_314_S14 | P7614_315_S15 |
|---|---|---|---|---|
| 0.918 | 0.8462 | 1.007 | 0.9453 | 1.102 |
| P7614_316_S16 | P7614_317_S17 | P7614_318_S18 | P7614_319_S19 | P7614_320_S20 |
|---|---|---|---|---|
| 1.01 | 0.9052 | 0.9999 | 0.9703 | 0.9241 |
| P7614_321_S21 | P7614_322_S22 | P7614_323_S23 | P7614_324_S24 |
|---|---|---|---|
| 1.119 | 1.072 | 0.9857 | 0.9988 |
boxplot(sizes, main="Sequencing libraries size factor")
vsd <- varianceStabilizingTransformation(dds, blind=TRUE)
vst <- assay(vsd)
vst <- vst - min(vst)
save(vst,file=here("data/analysis/DE/vst-blind_genes+TEs.rda"))
vsda <- varianceStabilizingTransformation(dds, blind=FALSE)
vsta <- assay(vsda)
vsta <- vsta - min(vsta)
save(vsta,file=here("data/analysis/DE/vst-aware_genes+TEs.rda"))
# prepare the data to build the network
#ID <- rownames(vsta)
#vsta <- cbind(ID,vsta)
#vsta_tibble <- as_tibble(vsta)
#write_tsv(vsta_tibble,path=here("data/analysis/DE/vst-aware_genes+TEs.tsv"))
The variance stabilisation worked adequately
meanSdPlot(vsta[rowSums(vsta)>0,])
pc <- prcomp(t(vsta))
percent <- round(summary(pc)$importance[2,]*100)
We define the number of variable of the model
nvar=1
An the number of possible combinations
nlevel=nlevels(dds$Stages)
We plot the percentage explained by the different components, the red line represent the number of variable in the model, the orange line the number of variable combinations.
ggplot(tibble(x=1:length(percent),y=cumsum(percent)),aes(x=x,y=y)) +
geom_line() + scale_y_continuous("variance explained (%)",limits=c(0,100)) +
scale_x_continuous("Principal component") +
geom_vline(xintercept=nvar,colour="red",linetype="dashed",size=0.5) +
geom_hline(yintercept=cumsum(percent)[nvar],colour="red",linetype="dashed",size=0.5) +
geom_vline(xintercept=nlevel,colour="orange",linetype="dashed",size=0.5) +
geom_hline(yintercept=cumsum(percent)[nlevel],colour="orange",linetype="dashed",size=0.5)
pc.dat <- bind_cols(PC1=pc$x[,1],
PC2=pc$x[,2],
csamples)
p <- ggplot(pc.dat,aes(x=PC1,y=PC2,col=dds$Stages,text=dds$ID)) +
geom_point(size=2) +
ggtitle("Principal Component Analysis",subtitle="variance stabilized counts")
ggplotly(p) %>%
layout(xaxis=list(title=paste("PC1 (",percent[1],"%)",sep="")),
yaxis=list(title=paste("PC2 (",percent[2],"%)",sep="")))
Filter for noise
conds <- factor(csamples$Stages)
sels <- rangeFeatureSelect(counts=vsta,
conditions=conds,
nrep=3)
## Warning in xy.coords(x, y, xlabel, ylabel, log): 1 y value <= 0 omitted from
## logarithmic plot
vst.cutoff <- 1
hm <- heatmap.2(t(scale(t(vsta[sels[[vst.cutoff+1]],]))),
distfun=pearson.dist,
hclustfun=function(X){hclust(X,method="ward.D2")},
labRow = NA,trace = "none",
labCol = conds,
col=hpal)
plot(as.hclust(hm$colDendrogram),xlab="",sub="",labels=conds)
#load(here("data/analysis/DE/vst-aware_genes+TEs.rda"))
TEs <- vsta[grepl("^MA_", rownames(vsta)) == FALSE, ]
pc_TEs <- prcomp(t(vsta[grepl("^MA_", rownames(vsta)) == FALSE, ]))
percent_TEs <- round(summary(pc_TEs)$importance[2,]*100)
pc.dat_TEs <- bind_cols(PC1=pc_TEs$x[,1],
PC2=pc_TEs$x[,2],
csamples)
p <- ggplot(pc.dat_TEs,aes(x=PC1,y=PC2,col=dds$Stages,text=dds$ID)) +
geom_point(size=2) +
ggtitle("Principal Component Analysis",subtitle="variance stabilized counts")
ggplotly(p) %>%
layout(xaxis=list(title=paste("PC1 (",percent_TEs[1],"%)",sep="")),
yaxis=list(title=paste("PC2 (",percent_TEs[2],"%)",sep="")))
Filter for noise
conds_TEs <- factor(csamples$Stages)
sels_TEs <- rangeFeatureSelect(counts=TEs,
conditions=conds_TEs,
nrep=3)
vst.cutoff <- 1
hm <- heatmap.2(t(scale(t(TEs[sels_TEs[[vst.cutoff+1]],]))),
distfun=pearson.dist,
hclustfun=function(X){hclust(X,method="ward.D2")},
labRow = NA,trace = "none",
labCol = conds_TEs,
col=hpal)
plot(as.hclust(hm$colDendrogram),xlab="",sub="",labels=conds_TEs)
hm2 <- heatmap.2(TEs,
scale = "row",
labRow = NULL,
labCol = conds_TEs,
trace = "none",
col=hpal)
plot(as.hclust(hm2$colDendrogram),xlab="",sub="",labels=conds_TEs)
# The Biological QA is good.
# The sequencing depth is good. Also looking at the PCAs we don't have any outliers.
## R version 4.0.0 (2020-04-24)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.4 LTS
##
## Matrix products: default
## BLAS/LAPACK: /usr/lib/x86_64-linux-gnu/libopenblasp-r0.2.20.so
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] grid parallel stats4 stats graphics grDevices utils
## [8] datasets methods base
##
## other attached packages:
## [1] vsn_3.56.0 tximport_1.16.1
## [3] forcats_0.5.0 stringr_1.4.0
## [5] dplyr_1.0.0 purrr_0.3.4
## [7] readr_1.3.1 tidyr_1.1.0
## [9] tibble_3.0.1 tidyverse_1.3.0
## [11] plotly_4.9.2.1 pander_0.6.3
## [13] hyperSpec_0.99-20200527 xml2_1.3.2
## [15] ggplot2_3.3.2 lattice_0.20-41
## [17] here_0.1 gplots_3.0.3
## [19] DESeq2_1.28.1 SummarizedExperiment_1.18.1
## [21] DelayedArray_0.14.0 matrixStats_0.56.0
## [23] Biobase_2.48.0 GenomicRanges_1.40.0
## [25] GenomeInfoDb_1.24.2 IRanges_2.22.2
## [27] S4Vectors_0.26.1 BiocGenerics_0.34.0
## [29] data.table_1.12.8
##
## loaded via a namespace (and not attached):
## [1] colorspace_1.4-1 ellipsis_0.3.1 rprojroot_1.3-2
## [4] XVector_0.28.0 fs_1.4.1 rstudioapi_0.11
## [7] hexbin_1.28.1 farver_2.0.3 affyio_1.58.0
## [10] bit64_0.9-7 AnnotationDbi_1.50.0 fansi_0.4.1
## [13] lubridate_1.7.9 splines_4.0.0 geneplotter_1.66.0
## [16] knitr_1.29 jsonlite_1.7.0 Cairo_1.5-12
## [19] broom_0.5.6 annotate_1.66.0 dbplyr_1.4.4
## [22] png_0.1-7 BiocManager_1.30.10 compiler_4.0.0
## [25] httr_1.4.1 backports_1.1.8 assertthat_0.2.1
## [28] Matrix_1.2-18 lazyeval_0.2.2 limma_3.44.3
## [31] cli_2.0.2 htmltools_0.5.0 tools_4.0.0
## [34] gtable_0.3.0 glue_1.4.1 GenomeInfoDbData_1.2.3
## [37] affy_1.66.0 Rcpp_1.0.4.6 cellranger_1.1.0
## [40] vctrs_0.3.1 preprocessCore_1.50.0 gdata_2.18.0
## [43] nlme_3.1-148 crosstalk_1.1.0.1 xfun_0.15
## [46] testthat_2.3.2 rvest_0.3.5 lifecycle_0.2.0
## [49] gtools_3.8.2 XML_3.99-0.3 zlibbioc_1.34.0
## [52] scales_1.1.1 hms_0.5.3 RColorBrewer_1.1-2
## [55] yaml_2.2.1 memoise_1.1.0 latticeExtra_0.6-29
## [58] stringi_1.4.6 RSQLite_2.2.0 highr_0.8
## [61] genefilter_1.70.0 caTools_1.18.0 BiocParallel_1.22.0
## [64] rlang_0.4.6 pkgconfig_2.0.3 bitops_1.0-6
## [67] evaluate_0.14 labeling_0.3 htmlwidgets_1.5.1
## [70] bit_1.1-15.2 tidyselect_1.1.0 magrittr_1.5
## [73] R6_2.4.1 generics_0.0.2 DBI_1.1.0
## [76] pillar_1.4.4 haven_2.3.1 withr_2.2.0
## [79] survival_3.2-3 RCurl_1.98-1.2 modelr_0.1.8
## [82] crayon_1.3.4 KernSmooth_2.23-17 rmarkdown_2.3
## [85] jpeg_0.1-8.1 locfit_1.5-9.4 readxl_1.3.1
## [88] blob_1.2.1 reprex_0.3.0 digest_0.6.25
## [91] xtable_1.8-4 munsell_0.5.0 viridisLite_0.3.0